In [1]:
%pylab inline
In [2]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import pandas as pd
In [3]:
import requests
from bs4 import BeautifulSoup
# soup = BeautifulSoup(html_doc)
In [4]:
# Download the PayScale "most and least meaningful jobs" full-list page.
# The timeout keeps this cell from hanging indefinitely if the server never
# answers; requests applies no timeout unless one is given.
r = requests.get(
    'http://www.payscale.com/data-packages/most-and-least-meaningful-jobs/full-list',
    timeout=30,
)
In [5]:
# Display the HTTP status code of the response (200 means the page was served).
r.status_code
Out[5]:
In [6]:
# Parse the downloaded HTML into a navigable tree.  The parser is named
# explicitly so the result does not depend on which optional parsers are
# installed, and so BeautifulSoup does not warn about an unspecified parser.
soup = BeautifulSoup(r.text, 'html.parser')
In [8]:
# Display the first <table> element found in the parsed page.
# Other ways to inspect the document: soup.prettify() or soup.find_all('table').
soup.table
In [3]:
# Address of the PayScale page whose HTML table is loaded into pandas.
url = "http://www.payscale.com/data-packages/most-and-least-meaningful-jobs/full-list"
In [4]:
# IPython help lookup: the trailing `?` displays the docstring of read_html.
pd.io.html.read_html?
In [5]:
# Parse every HTML <table> on the page into a list of DataFrames, using the
# first row of each table as the column header.
# Only the non-default option is passed: `infer_types` and `tupleize_cols` are
# no longer accepted by read_html (current pandas raises TypeError for them),
# and the other arguments of the original call were the documented defaults.
dflist = pd.read_html(url, header=0)
In [6]:
# Work on a copy of the first parsed table: the cleaning cells assign columns
# in place, and copying keeps the raw read_html result in `dflist` untouched.
df = dflist[0].copy()
In [7]:
# Show the dtype of each column of the parsed table.
df.dtypes
Out[7]:
In [7]:
# Preview the first rows of the parsed table.
df.head()
Out[7]:
In [8]:
# Strip the percent sign from every text column so the values can later be
# converted to numbers.  Columns that are already numeric have no `.str`
# accessor, so they are skipped instead of raising AttributeError.
for c in df.columns:
    if not pd.api.types.is_numeric_dtype(df[c]):
        # regex=False: treat '%' as a literal character, not a pattern.
        df[c] = df[c].str.replace('%', '', regex=False)
In [9]:
# Remove the dollar sign from the second column (presumably the pay column;
# verify against the table header).  `$` is a regex end-of-string anchor, so
# the pattern is passed as a literal instead of relying on how the installed
# pandas version interprets single-character patterns.
df[df.columns[1]] = df[df.columns[1]].str.replace('$', '', regex=False)
In [10]:
# Drop the comma separators from the second column's text values.
second_col = df.columns[1]
df[second_col] = df[second_col].str.replace(',', '')
In [11]:
# Preview the first rows again to check the result of the symbol-stripping steps.
df.head()
Out[11]:
In [12]:
def _to_numeric_if_possible(column):
    """Return ``column`` parsed as numbers, or unchanged if nothing parses.

    Values that cannot be parsed become NaN.  A column in which no value
    parses at all (e.g. free text) is returned as-is so it is not turned
    into a column of NaN.
    """
    converted = pd.to_numeric(column, errors='coerce')
    return column if converted.isna().all() else converted


# DataFrame.convert_objects is no longer available in pandas; apply the
# per-column pd.to_numeric conversion instead.
df = df.apply(_to_numeric_if_possible)
In [13]:
# Show the column dtypes again to check the numeric conversion.
df.dtypes
Out[13]:
In [14]:
# The pandas `display.mpl_style` option no longer exists (setting it raises
# an OptionError on current pandas); choose the plot style through matplotlib.
# NOTE(review): 'ggplot' is used as the closest stand-in for the old pandas
# 'default' style -- confirm the look is acceptable.
plt.style.use('ggplot')
In [15]:
# Default figure size, in inches (width, height), for the plots that follow.
# Set through rcParams so the cell does not depend on the `figsize` helper
# that only exists in the namespace after `%pylab`.
plt.rcParams['figure.figsize'] = (10, 10)
In [20]:
# Bubble chart of job satisfaction against job stress; each marker's size is
# the job's median pay scaled by 0.001.
fig, ax = plt.subplots()
ax.scatter(
    x=df['% High Stress'],
    y=df['% High Satisfaction'],
    s=df['Median Pay'] * .001,
    alpha=0.5,
)
ax.set_xlabel('% High Stress')
ax.set_ylabel('% High Satisfaction')
ax.set_title('High Satisfaction vs High Stress Jobs', fontsize=16)
plt.show()